Note: This code is quite memory intensive because of the pre-trained Google News model that is used; expect it to use about 9 GB of RAM.
In [33]:
import re
from gensim import models
from scipy import spatial
import numpy as np
import os.path
import urllib
import gzip
import json
import pandas as pd
In [18]:
def search_tags(entity, search):
    """
    Search through all of the 'tags' (semantic content) of an entity
    and return True if the search expression is found. Case insensitive.
    """
    all_tags = '; '.join([str(x) for x in entity['tags'].values()])
    return bool(re.search(search, all_tags, flags=re.IGNORECASE))
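For example, search_tags can be called on a single catalog entry. The dictionary below is a made-up illustration of the expected shape of an entity (a 'tags' dictionary of tag names to values), not real catalog data:

exampleEntity = {'tags': {'Name': 'Roundwood, softwood, average, at forest road',
                          'Category': 'wood products'}}
print search_tags(exampleEntity, 'roundwood, softwood')  # True
print search_tags(exampleEntity, 'steel')                # False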
In [14]:
def gunzipFile(inFileName, outFileName):
    inF = gzip.open(inFileName, 'rb')
    outF = open(outFileName, 'wb')
    outF.write(inF.read())
    inF.close()
    outF.close()
In [3]:
# the idea for this code comes from this blog post:
# http://sujitpal.blogspot.nl/2015/09/sentence-similarity-using-word2vec-and.html
def sentenceDistance(sent1, sent2, stoplist):
    # remove all non-alphanumeric characters
    sent1 = re.sub('[^0-9a-zA-Z]+', ' ', sent1)
    sent2 = re.sub('[^0-9a-zA-Z]+', ' ', sent2)
    # split the sentences into tokens, convert to lower case, and remove stopwords
    tokens1 = [word for word in sent1.lower().split() if word not in stoplist]
    tokens2 = [word for word in sent2.lower().split() if word not in stoplist]
    # keep only unique tokens
    tokens1 = list(set(tokens1))
    tokens2 = list(set(tokens2))
    # For every word in sent1, find the shortest distance to a word in sent2.
    # If there is a matching word the distance is 0; if a synonym is found the
    # distance should be small. The sum of these shortest distances over all
    # words in sent1 is returned as totalDist. 9999 is a sentinel value meaning
    # "no comparable words were found in the model vocabulary".
    totalDist = 9999
    for token1 in tokens1:
        if token1 in model.vocab:
            minDist = 9999
            for token2 in tokens2:
                if token2 in model.vocab:
                    lv = model[token1]
                    rv = model[token2]
                    dist = spatial.distance.cosine(lv, rv)
                    # instead of cosine distance, euclidean distance can also be tried:
                    # dist = spatial.distance.euclidean(lv, rv)
                    if dist < minDist:
                        minDist = dist
            if minDist < 9999:
                if totalDist == 9999:
                    totalDist = minDist
                else:
                    totalDist = totalDist + minDist
    return totalDist
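As a quick illustration, once the stoplist and the word2vec model have been loaded (see the cells below), the function can be compared on a couple of made-up process names; the exact distance values depend on the model, but the more similar pair should score lower:

d1 = sentenceDistance('Roundwood, softwood', 'Softwood lumber', stoplist)
d2 = sentenceDistance('Roundwood, softwood', 'Electricity from coal', stoplist)
# smaller is more similar; 9999 means no comparable words were found
print d1, d2  # expect d1 < d2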
Load in the stopwords file. These are common words that we wish to exclude when performing comparisons (a, an, the, etc.). Each word is on its own line.
In [4]:
stopWordsFile = "en.txt"
with open(stopWordsFile) as f:
    stoplist = [x.strip('\n') for x in f.readlines()]
We need to check whether we already have the word2vec model that has been pre-trained on the Google News corpus. The vectors have 300 dimensions, and the model was trained on a corpus of roughly 100 billion words.
Note: This file is 1.6 GB compressed and expands to 3.4 GB.
In [10]:
if os.path.isfile("GoogleNews-vectors-negative300.bin.gz") == False:
# This is the direct download link for GoogleNews-vectors-negative300.bin.gz
# If the link changes, just search for the filename as this is a file often used for word2vec
downloadURL = 'https://doc-0g-8s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/dhu4deogg9hg0tkm9tdann504ue0vp91/1461232800000/06848720943842814915/*/0B7XkCwpI5KDYNlNUTTlSS21pQmM?e=download'
urllib.urlretrieve (downloadURL, "GoogleNews-vectors-negative300.bin.gz")
Unzip the file. This may take several minutes due to the Python gzip library; it may be quicker to do this from the command line or with a system call (a sketch of that alternative follows the next cell).
In [11]:
if os.path.isfile("GoogleNews-vectors-negative300.bin") == False:
gunzipFile('GoogleNews-vectors-negative300.bin.gz', 'GoogleNews-vectors-negative300.bin')
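For reference, the system-call alternative mentioned above could look something like the sketch below. It assumes gunzip is available on the system path; the -c flag writes the decompressed data to stdout, so the original .gz file is left in place.

import os
if not os.path.isfile("GoogleNews-vectors-negative300.bin"):
    os.system('gunzip -c GoogleNews-vectors-negative300.bin.gz > GoogleNews-vectors-negative300.bin')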
Create a gensim model using this pre-trained data set.
In [12]:
model = models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
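As an optional sanity check, the loaded model can be queried directly. These calls use the same (older) gensim Word2Vec API as the load call above, and the example words are only illustrative:

print model['tree'].shape                 # each word vector has 300 dimensions
print 'tree' in model.vocab               # True if the word is in the Google News vocabulary
print model.similarity('wood', 'timber')  # cosine similarity between two related words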
Load in the data from the catalogs.
In [34]:
# http://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-ones-from-json-in-python
# need this to deal with unicode errors
def byteify(input):
    if isinstance(input, dict):
        return {byteify(key): byteify(value)
                for key, value in input.iteritems()}
    elif isinstance(input, list):
        return [byteify(element) for element in input]
    elif isinstance(input, unicode):
        return input.encode('utf-8')
    else:
        return input

gunzipFile('../catalogs/gabi_2016_professional-database-2016.json.gz',
           '../catalogs/gabi_2016_professional-database-2016.json')
gunzipFile('../catalogs/uslci_ecospold.json.gz',
           '../catalogs/uslci_ecospold.json')

with open('../catalogs/gabi_2016_professional-database-2016.json') as data_file:
    gabi = json.load(data_file, encoding='utf-8')
with open('../catalogs/uslci_ecospold.json') as data_file:
    uslci = json.load(data_file, encoding='utf-8')

gabi = byteify(gabi)
uslci = byteify(uslci)
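To get a feel for the structures just loaded, they can be inspected briefly; this sketch only uses the keys that appear in the matching code below ('flows', 'archives', 'tags', 'Name'):

print len(uslci['flows'])                 # number of flows in the USLCI catalog
print len(gabi['archives'][0]['flows'])   # number of flows in the GaBi archive
print uslci['flows'][0]['tags']['Name']   # each flow keeps its semantic content in 'tags'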
In [78]:
roundwood = [flow for flow in uslci['flows'] if search_tags(flow, 'roundwood, softwood')]
roundwoodExample = roundwood[0]

# number of top scores to show
numTopScores = 10

# compute the distance from the example flow name to every flow name in GaBi
flowNames = []
distValues = []
for flow in gabi['archives'][0]['flows']:
    name = flow['tags']['Name']
    flowNames.append(name)
    dist = sentenceDistance(roundwoodExample['tags']['Name'], name, stoplist)
    distValues.append(dist)

# figure out the top scores (smallest distances)
arr = np.array(distValues)
topIndices = arr.argsort()[0:numTopScores]
topScores = np.array(distValues)[topIndices]

print 'Process name to match:'
print roundwoodExample['tags']['Name']
print 'Matches using Word2Vec:'
for i, s in zip(topIndices, topScores):
    if s < 9999:
        print flowNames[i], s
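pandas is imported at the top of the notebook but not otherwise used here. As an optional follow-up, the matches could be collected into a DataFrame for easier inspection; this is just a sketch built on the variables computed above:

matches = pd.DataFrame({'gabi_flow_name': [flowNames[i] for i in topIndices],
                        'distance': topScores})
# topIndices is already ordered from smallest to largest distance
print matches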